EDA

Import the dataset, explore and summarize it


In [11]:
# load the necessary python modules
import matplotlib.pyplot as plt
import matplotlib
import pickle
import pandas as pd 
import numpy as np
from IPython.display import display
%matplotlib notebook

In [12]:
### Load the dictionary containing the dataset. This code taken from poi_id.py script provided by udacity.
# BUG FIX: pickle files are binary -- open with "rb", not "r". Text mode fails
# outright on Python 3 (pickle.load needs bytes) and can corrupt the stream on
# Windows; "rb" also works unchanged on Python 2.
# NOTE(review): pickle.load can execute arbitrary code from an untrusted file;
# acceptable here since the dataset ships with the course project.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

In [13]:
# Headline numbers for the project report: people, features per person, POIs.
num_persons = len(data_dict)
num_features = len(list(data_dict.values())[0])
num_pois = sum(1 for person in data_dict.values() if person['poi'])
print("Total Number of persons: %d" % num_persons)
print("Total Number of features: %d" % num_features)
print("Total Number of POIs: %d" % num_pois)


Total Number of persons: 146
Total Number of features: 21
Total Number of POIs: 18

In [14]:
# BUG FIX: this was a Python 2-only print statement, inconsistent with the
# print() calls used everywhere else in this notebook. list() keeps the
# rendered output a plain list under Python 3 (where keys() is a dict view).
print(list(data_dict.keys()))


['METTS MARK', 'BAXTER JOHN C', 'ELLIOTT STEVEN', 'CORDES WILLIAM R', 'HANNON KEVIN P', 'MORDAUNT KRISTINA M', 'MEYER ROCKFORD G', 'MCMAHON JEFFREY', 'HORTON STANLEY C', 'PIPER GREGORY F', 'HUMPHREY GENE E', 'UMANOFF ADAM S', 'BLACHMAN JEREMY M', 'SUNDE MARTIN', 'GIBBS DANA R', 'LOWRY CHARLES P', 'COLWELL WESLEY', 'MULLER MARK S', 'JACKSON CHARLENE R', 'WESTFAHL RICHARD K', 'WALTERS GARETH W', 'WALLS JR ROBERT H', 'KITCHEN LOUISE', 'CHAN RONNIE', 'BELFER ROBERT', 'SHANKMAN JEFFREY A', 'WODRASKA JOHN', 'BERGSIEKER RICHARD P', 'URQUHART JOHN A', 'BIBI PHILIPPE A', 'RIEKER PAULA H', 'WHALEY DAVID A', 'BECK SALLY W', 'HAUG DAVID L', 'ECHOLS JOHN B', 'MENDELSOHN JOHN', 'HICKERSON GARY J', 'CLINE KENNETH W', 'LEWIS RICHARD', 'HAYES ROBERT E', 'MCCARTY DANNY J', 'KOPPER MICHAEL J', 'LEFF DANIEL P', 'LAVORATO JOHN J', 'BERBERIAN DAVID', 'DETMERING TIMOTHY J', 'WAKEHAM JOHN', 'POWERS WILLIAM', 'GOLD JOSEPH', 'BANNANTINE JAMES M', 'DUNCAN JOHN H', 'SHAPIRO RICHARD S', 'SHERRIFF JOHN R', 'SHELBY REX', 'LEMAISTRE CHARLES', 'DEFFNER JOSEPH M', 'KISHKILL JOSEPH G', 'WHALLEY LAWRENCE G', 'MCCONNELL MICHAEL S', 'PIRO JIM', 'DELAINEY DAVID W', 'SULLIVAN-SHAKLOVITZ COLLEEN', 'WROBEL BRUCE', 'LINDHOLM TOD A', 'MEYER JEROME J', 'LAY KENNETH L', 'BUTTS ROBERT H', 'OLSON CINDY K', 'MCDONALD REBECCA', 'CUMBERLAND MICHAEL S', 'GAHN ROBERT S', 'MCCLELLAN GEORGE', 'HERMANN ROBERT J', 'SCRIMSHAW MATTHEW', 'GATHMANN WILLIAM D', 'HAEDICKE MARK E', 'BOWEN JR RAYMOND M', 'GILLIS JOHN', 'FITZGERALD JAY L', 'MORAN MICHAEL P', 'REDMOND BRIAN L', 'BAZELIDES PHILIP J', 'BELDEN TIMOTHY N', 'DURAN WILLIAM D', 'THORN TERENCE H', 'FASTOW ANDREW S', 'FOY JOE', 'CALGER CHRISTOPHER F', 'RICE KENNETH D', 'KAMINSKI WINCENTY J', 'LOCKHART EUGENE E', 'COX DAVID', 'OVERDYKE JR JERE C', 'PEREIRA PAULO V. FERRAZ', 'STABLER FRANK', 'SKILLING JEFFREY K', 'BLAKE JR. 
NORMAN P', 'SHERRICK JEFFREY B', 'PRENTICE JAMES', 'GRAY RODNEY', 'PICKERING MARK R', 'THE TRAVEL AGENCY IN THE PARK', 'NOLES JAMES L', 'KEAN STEVEN J', 'TOTAL', 'FOWLER PEGGY', 'WASAFF GEORGE', 'WHITE JR THOMAS E', 'CHRISTODOULOU DIOMEDES', 'ALLEN PHILLIP K', 'SHARP VICTORIA T', 'JAEDICKE ROBERT', 'WINOKUR JR. HERBERT S', 'BROWN MICHAEL', 'BADUM JAMES P', 'HUGHES JAMES A', 'REYNOLDS LAWRENCE', 'DIMICHELE RICHARD G', 'BHATNAGAR SANJAY', 'CARTER REBECCA C', 'BUCHANAN HAROLD G', 'YEAP SOON', 'MURRAY JULIA H', 'GARLAND C KEVIN', 'DODSON KEITH', 'YEAGER F SCOTT', 'HIRKO JOSEPH', 'DIETRICH JANET R', 'DERRICK JR. JAMES V', 'FREVERT MARK A', 'PAI LOU L', 'BAY FRANKLIN R', 'HAYSLETT RODERICK J', 'FUGH JOHN L', 'FALLON JAMES B', 'KOENIG MARK E', 'SAVAGE FRANK', 'IZZO LAWRENCE L', 'TILNEY ELIZABETH A', 'MARTIN AMANDA K', 'BUY RICHARD B', 'GRAMM WENDY L', 'CAUSEY RICHARD A', 'TAYLOR MITCHELL S', 'DONAHUE JR JEFFREY M', 'GLISAN JR BEN F']

In [15]:
# Build a DataFrame from the dict-of-dicts; persons arrive as columns.
enron_df = pd.DataFrame.from_dict(data_dict)
# 'TOTAL' and 'THE TRAVEL AGENCY IN THE PARK' are not people -- drop them
# before transposing to one-row-per-person orientation.
enron_df = enron_df.drop(['TOTAL', 'THE TRAVEL AGENCY IN THE PARK'], axis=1)
enron_df = enron_df.transpose()

# Coerce every column to numeric; unparsable entries (e.g. 'NaN' strings) become NaN.
enron_df_num = enron_df.apply(pd.to_numeric, errors='coerce')
# email_address is the only non-numeric feature and is not needed for this analysis.
enron_df_num = enron_df_num.drop('email_address', axis=1)

enron_df_num.describe()


Out[15]:
bonus deferral_payments deferred_income director_fees exercised_stock_options expenses from_messages from_poi_to_this_person from_this_person_to_poi loan_advances long_term_incentive other restricted_stock restricted_stock_deferred salary shared_receipt_with_poi to_messages total_payments total_stock_value
count 8.100000e+01 3.800000e+01 4.800000e+01 16.000000 1.010000e+02 94.000000 86.000000 86.000000 86.000000 3.000000e+00 6.500000e+01 9.100000e+01 1.090000e+02 1.700000e+01 9.400000e+01 86.000000 86.000000 1.230000e+02 1.250000e+02
mean 1.201773e+06 8.416025e+05 -5.810498e+05 89822.875000 2.959559e+06 54192.010638 608.790698 64.895349 41.232558 2.797500e+07 7.464912e+05 4.664105e+05 1.147424e+06 6.218928e+05 2.840875e+05 1176.465116 2073.860465 2.641806e+06 3.352073e+06
std 1.441679e+06 1.289323e+06 9.420764e+05 41112.700735 5.499450e+06 46108.377454 1841.033949 86.979244 100.073111 4.638256e+07 8.629174e+05 1.397376e+06 2.249770e+06 3.845528e+06 1.771311e+05 1178.317641 2582.700981 9.524694e+06 6.532883e+06
min 7.000000e+04 -1.025000e+05 -3.504386e+06 3285.000000 3.285000e+03 148.000000 12.000000 0.000000 0.000000 4.000000e+05 6.922300e+04 2.000000e+00 -2.604490e+06 -1.787380e+06 4.770000e+02 2.000000 57.000000 1.480000e+02 -4.409300e+04
25% 4.250000e+05 7.964450e+04 -6.112092e+05 83674.500000 5.067650e+05 22479.000000 22.750000 10.000000 1.000000 1.200000e+06 2.750000e+05 1.203000e+03 2.520550e+05 -3.298250e+05 2.118020e+05 249.750000 541.250000 3.969340e+05 4.941360e+05
50% 7.500000e+05 2.210635e+05 -1.519270e+05 106164.500000 1.297049e+06 46547.500000 41.000000 35.000000 8.000000 2.000000e+06 4.221580e+05 5.158700e+04 4.410960e+05 -1.402640e+05 2.587410e+05 740.500000 1211.000000 1.101393e+06 1.095040e+06
75% 1.200000e+06 8.672112e+05 -3.792600e+04 112815.000000 2.542813e+06 78408.500000 145.500000 72.250000 24.750000 4.176250e+07 8.318090e+05 3.319830e+05 9.850320e+05 -7.241900e+04 3.086065e+05 1888.250000 2634.750000 2.087530e+06 2.606763e+06
max 8.000000e+06 6.426990e+06 -8.330000e+02 137864.000000 3.434838e+07 228763.000000 14368.000000 528.000000 609.000000 8.152500e+07 5.145434e+06 1.035973e+07 1.476169e+07 1.545629e+07 1.111258e+06 5521.000000 15149.000000 1.035598e+08 4.911008e+07

In [16]:
# Number of records left after removing the two non-person entries.
enron_df_num.shape[0]


Out[16]:
144

We are left with 144 records now in our dataframe.

Also, the summary of the dataset shows a very large standard deviation for some of the features and a lot of missing data for others. We will drop some of these features as below.


In [17]:
# These three features are almost entirely missing (see the describe() output:
# only 3, 17 and 16 non-null values respectively), so drop them in place.
enron_df_num.drop(['loan_advances', 'restricted_stock_deferred', 'director_fees'],
                  axis=1, inplace=True)

In [18]:
# Feature selection: rank features by their correlation with the POI label.
data_corr_list = enron_df_num.corr()
poi_correlations = data_corr_list['poi']
print('\nCorrelations between features to POI:\n ' + str(poi_correlations))


Correlations between features to POI:
 bonus                      0.302384
deferral_payments         -0.098428
deferred_income           -0.265698
exercised_stock_options    0.503551
expenses                   0.060292
from_messages             -0.074308
from_poi_to_this_person    0.167722
from_this_person_to_poi    0.112940
long_term_incentive        0.254723
other                      0.120270
poi                        1.000000
restricted_stock           0.224814
salary                     0.264976
shared_receipt_with_poi    0.228313
to_messages                0.058954
total_payments             0.230102
total_stock_value          0.366462
Name: poi, dtype: float64

Features ‘exercised_stock_options’, ‘total_stock_value’, and ‘bonus’ have the highest correlation to POI, in descending order.


In [19]:
# Separate the label from the numeric feature matrix.
del enron_df_num['poi']
poi = enron_df['poi']

# Engineered stock features. Note: plain '+' chains propagate NaN when any
# component is missing (unlike DataFrame.sum, which skips NaN).
enron_df_num['stock_sum'] = (enron_df_num['exercised_stock_options']
                             + enron_df_num['total_stock_value']
                             + enron_df_num['restricted_stock'])
enron_df_num['stock_ratio'] = (enron_df_num['exercised_stock_options']
                               / enron_df_num['total_stock_value'])

# Engineered money features.
enron_df_num['money_total'] = (enron_df_num['salary']
                               + enron_df_num['bonus']
                               - enron_df_num['expenses'])
enron_df_num['money_ratio'] = enron_df_num['bonus'] / enron_df_num['salary']

# Email-traffic features: overall send share and POI interaction rates.
total_messages = enron_df_num['to_messages'] + enron_df_num['from_messages']
enron_df_num['email_ratio'] = enron_df_num['from_messages'] / total_messages
enron_df_num['poi_email_ratio_from'] = (enron_df_num['from_poi_to_this_person']
                                        / enron_df_num['to_messages'])
enron_df_num['poi_email_ratio_to'] = (enron_df_num['from_this_person_to_poi']
                                      / enron_df_num['from_messages'])

# Replace missing values with each column's mean (computed on observed values).
enron_df_num = enron_df_num.fillna(enron_df_num.mean())

# Min-max scale every feature into the [0, 1] range.
col_min = enron_df_num.min()
col_max = enron_df_num.max()
enron_df_num = (enron_df_num - col_min) / (col_max - col_min)

In [20]:
from sklearn.feature_selection import SelectKBest

# Score every feature against the POI label using SelectKBest's default
# univariate scoring function, then rank features from strongest to weakest.
selector = SelectKBest()
selector.fit(enron_df_num, poi.tolist())
scores = dict(zip(enron_df_num.columns, selector.scores_))
sorted_features = sorted(scores, key=scores.get, reverse=True)
for feature in sorted_features:
    print('Feature %s has value %f' % (feature, scores[feature]))


Feature exercised_stock_options has value 29.133390
Feature total_stock_value has value 21.477343
Feature stock_sum has value 15.039523
Feature poi_email_ratio_to has value 13.360475
Feature bonus has value 11.437118
Feature money_total has value 10.334752
Feature salary has value 9.398674
Feature total_payments has value 7.734639
Feature restricted_stock has value 6.853888
Feature long_term_incentive has value 5.964237
Feature shared_receipt_with_poi has value 5.730789
Feature deferred_income has value 5.610048
Feature money_ratio has value 3.895578
Feature from_poi_to_this_person has value 3.036263
Feature email_ratio has value 2.035016
Feature other has value 1.908430
Feature from_this_person_to_poi has value 1.360849
Feature poi_email_ratio_from has value 1.161332
Feature from_messages has value 0.585913
Feature expenses has value 0.478571
Feature deferral_payments has value 0.380285
Feature to_messages has value 0.368235
Feature stock_ratio has value 0.013267

In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.cross_validation import StratifiedShuffleSplit
import scipy
import warnings
warnings.filterwarnings('ignore')

# GaussianNB has no hyper-parameters to tune; wrapping it in an empty grid
# search just gives all three models the same fit()/best_estimator_ API.
gnb_clf = GridSearchCV(GaussianNB(), {})

svc_clf = SVC()
svc_search_params = {'C': scipy.stats.expon(scale=1),
                     'gamma': scipy.stats.expon(scale=.1),
                     'kernel': ['linear', 'poly', 'rbf'],
                     'class_weight': ['balanced', None]}
svc_search = RandomizedSearchCV(svc_clf,
                                param_distributions=svc_search_params,
                                n_iter=25)

tree_clf = DecisionTreeClassifier()
tree_search_params = {'criterion': ['gini', 'entropy'],
                      'max_leaf_nodes': [None, 25, 50, 100, 1000],
                      'min_samples_split': [2, 3, 4],
                      'max_features': [0.25, 0.5, 0.75, 1.0]}
tree_search = GridSearchCV(tree_clf,
                           tree_search_params,
                           scoring='recall')

search_methods = [gnb_clf, svc_search, tree_search]
# Index 0 of each list is a dummy entry so that index k lines up with the
# result for the top-k feature subset.
average_accuracies = [[0], [0], [0]]
average_precision = [[0], [0], [0]]
average_recall = [[0], [0], [0]]

num_splits = 10
train_split = 0.9
# Stratified splits preserve the POI/non-POI ratio in every fold -- important
# because POIs are a small minority of the 144 records.
indices = list(StratifiedShuffleSplit(poi.tolist(),
                                      num_splits,
                                      test_size=1 - train_split,
                                      random_state=0))

best_features = None
max_score = 0
best_classifier = None
for num_features in range(1, len(sorted_features) + 1):
    # Evaluate the top-k features as ranked earlier by SelectKBest.
    features = sorted_features[:num_features]
    feature_df = enron_df_num[features]
    for classifier_idx in range(len(search_methods)):
        sum_values = [0, 0, 0]
        # Only run the (expensive) hyper-parameter search once, on the
        # training half of the first split; reuse the best estimator after.
        search_methods[classifier_idx].fit(feature_df.iloc[indices[0][0], :],
                                           poi[indices[0][0]].tolist())
        classifier = search_methods[classifier_idx].best_estimator_
        for split_idx in range(num_splits):
            train_indices, test_indices = indices[split_idx]
            train_data = (feature_df.iloc[train_indices, :], poi[train_indices].tolist())
            test_data = (feature_df.iloc[test_indices, :], poi[test_indices].tolist())
            classifier.fit(train_data[0], train_data[1])
            predicted = classifier.predict(test_data[0])
            # BUG FIX: sklearn metrics take (y_true, y_pred) in that order.
            # The original passed predictions first, which silently swapped
            # precision and recall (accuracy is symmetric so it was unaffected).
            sum_values[0] += accuracy_score(test_data[1], predicted)
            sum_values[1] += precision_score(test_data[1], predicted)
            sum_values[2] += recall_score(test_data[1], predicted)
        avg_acc, avg_prs, avg_recall = [val / num_splits for val in sum_values]
        average_accuracies[classifier_idx].append(avg_acc)
        average_precision[classifier_idx].append(avg_prs)
        average_recall[classifier_idx].append(avg_recall)

        # Keep the best (precision+recall)/2 score, but only if both metrics
        # clear the project's 0.3 minimum requirement.
        score = (avg_prs + avg_recall) / 2
        if score > max_score and avg_prs > 0.3 and avg_recall > 0.3:
            max_score = score
            best_features = features
            best_classifier = search_methods[classifier_idx].best_estimator_
print('Best classifier found is %s \n\
       with score (recall+precision)/2 of %f\n\
       and feature set %s'%(str(best_classifier),max_score,best_features))


Best classifier found is DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=0.25, max_leaf_nodes=50, min_samples_leaf=1,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best') 
       with score (recall+precision)/2 of 0.420000
       and feature set ['exercised_stock_options', 'total_stock_value', 'stock_sum', 'poi_email_ratio_to', 'bonus', 'money_total']

In [ ]: